1

library(datasets) # Load iris data
library(ggplot2)
library(nnet) # For multinomial logistic regression
library(naivebayes)
## naivebayes 0.9.7 loaded
library(adabag) # For AdaBoost
## Loading required package: rpart
## Loading required package: caret
## Loading required package: lattice
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(rpart) # For AdaBoost
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble  3.1.5     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x purrr::accumulate() masks foreach::accumulate()
## x dplyr::filter()     masks stats::filter()
## x dplyr::lag()        masks stats::lag()
## x purrr::lift()       masks caret::lift()
## x purrr::when()       masks foreach::when()
library(ggpubr)
library(caret)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

1a Boxplots, Correlation Matrix, Scatter Plots of Features Pairs:

data <- iris

# Boxplot of each numeric feature, titled with a human-readable name
feature_titles <- c(
  Sepal.Length = "Sepal Length",
  Sepal.Width  = "Sepal Width",
  Petal.Length = "Petal Length",
  Petal.Width  = "Petal Width"
)
for (feature in names(feature_titles)) {
  boxplot(data[[feature]], main = feature_titles[[feature]])
}

# Species is categorical, so a boxplot is not meaningful
# boxplot(data$Species)

# Correlation matrix of the four numeric features
cor(data[1:4])
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
## Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
## Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
## Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000
# Scatter plots for every pair of numeric features, coloured by species
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_point()

ggplot(iris, aes(x = Sepal.Length, y = Petal.Length, color = Species)) +
  geom_point()

ggplot(iris, aes(x = Sepal.Length, y = Petal.Width, color = Species)) +
  geom_point()

ggplot(iris, aes(x = Sepal.Width, y = Petal.Length, color = Species)) +
  geom_point()

ggplot(iris, aes(x = Sepal.Width, y = Petal.Width, color = Species)) +
  geom_point()

ggplot(iris, aes(x = Petal.Length, y = Petal.Width, color = Species)) +
  geom_point()

1b:

Use X1, X2, X3 to fit the multinomial logistic regression, the Naive Bayes classifier, and the AdaBoost with 30 Decision Tree classifiers with maximum depth 3. Plot the decision surfaces of the 3 classifiers on (X1, X2) and (X1, X3) respectively.

# ---- 1b: fit three classifiers on X1, X2, X3 and draw decision surfaces ----

# Multinomial logistic regression
multinom_model <- multinom(Species ~ Sepal.Length + Sepal.Width + Petal.Length, data = iris)

# Naive Bayes on the same three features
nb_model <- naive_bayes(Species ~ Sepal.Length + Sepal.Width + Petal.Length, data = iris)

# AdaBoost with 30 decision trees of maximum depth 3
boost_model <- boosting(Species ~ Sepal.Length + Sepal.Width + Petal.Length,
                        data = iris, mfinal = 30, control = rpart.control(maxdepth = 3))

# Generate grid points over the observed feature ranges for the surface plots
r <- sapply(iris[, c(1, 2, 3)], range, na.rm = TRUE)
x1s <- seq(r[1, 1], r[2, 1], length.out = 500)
x2s <- seq(r[1, 2], r[2, 2], length.out = 500)
x3s <- seq(r[1, 3], r[2, 3], length.out = 500)
# NOTE(review): this builds a 500x500 grid over (X1, X2) with X3 varying in
# lockstep with X2 rather than a full 3-D grid — confirm this slice is the
# intended background for the decision-surface plots.
# ("times", not "time": the original relied on partial argument matching.)
grid <- cbind(rep(x1s, each = 500), rep(x2s, times = 500), rep(x3s, times = 500))
colnames(grid) <- colnames(r)
grid <- as.data.frame(grid)

# Predictions on the grid. All three models are fitted and predicted BEFORE
# any plot is built: the original referenced boost_pred inside boost_ds1
# before boost_model had been fitted, which errors on a clean run.
multinom_pred <- predict(multinom_model, newdata = grid, type = "class")
nb_pred <- predict(nb_model, grid, type = "class")
boost_pred <- predict(boost_model, grid, type = "class")$class

# Decision surfaces on (X1, X2): raster of grid predictions + observed points
mult_ds1 <- ggplot() +
  geom_point(data = iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species),
             size = 2, show.legend = FALSE) +
  geom_raster(alpha = 0.1, aes(x = grid[, 1], y = grid[, 2], fill = multinom_pred),
              show.legend = FALSE) +
  theme_bw()

nb_ds1 <- ggplot() +
  geom_point(data = iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species),
             size = 2, show.legend = FALSE) +
  geom_raster(alpha = 0.1, aes(x = grid[, 1], y = grid[, 2], fill = nb_pred),
              show.legend = FALSE) +
  theme_bw()

boost_ds1 <- ggplot() +
  geom_point(data = iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species),
             size = 2, show.legend = FALSE) +
  geom_raster(alpha = 0.1, aes(x = grid[, 1], y = grid[, 2], fill = boost_pred),
              show.legend = FALSE) +
  theme_bw()

ggarrange(mult_ds1, nb_ds1, boost_ds1, ncol = 2, nrow = 2)

1c:

Based on the decision surfaces, compare and summarize the advantage and disadvantage of each classifier.

2a:

Creating correlated errors through time. Please write a function that takes in an integer, n ≥1, then produces “errors” that are generated in the following way:

# Generate n correlated "errors": a Gaussian random walk in which the first
# error is N(0, 1) and each subsequent error is N(previous error, 1).
#
# Args:
#   n: number of errors to generate (n >= 1).
# Returns:
#   Numeric vector of length n.
generate_errors <- function(n) {
  stopifnot(n >= 1)
  # Preallocate instead of growing with c() (which copies the whole vector on
  # every append), and index directly instead of relying on dplyr::last().
  # This consumes the RNG stream in the same order as the original loop.
  errors <- numeric(n)
  errors[1] <- rnorm(1, 0, 1)
  for (i in seq_len(n)[-1]) {
    errors[i] <- rnorm(1, mean = errors[i - 1], sd = 1)
  }
  errors
}

2b

# Generate 5 independent runs of correlated errors (500 points each) and stack
# them into one long data frame with columns cor / trial / index.
# base data.frame() replaces the deprecated tibble::data_frame(), and the
# lapply/do.call pattern removes the copy-pasted corr1..corr5 blocks.
runs <- lapply(1:5, function(trial_id) {
  data.frame(cor = generate_errors(500), trial = trial_id)
})
final_df <- do.call(rbind, runs)
final_df$index <- seq_len(nrow(final_df))

library(RColorBrewer)

# Correlated-errors plot. scale_color_brewer (not scale_fill_brewer) is needed
# because the aesthetic is colour, and trial is wrapped in factor() because
# Brewer palettes are discrete.
cor_plot <- ggplot(final_df, aes(x = index, y = cor, color = factor(trial))) +
  geom_point() +
  scale_color_brewer(palette = "Dark2")


# Five runs of independent N(0, 1) errors (500 points each) in the same long
# format as the correlated data: columns cor / trial / index.
# base data.frame() replaces the deprecated tibble::data_frame().
runs <- lapply(1:5, function(trial_id) {
  data.frame(cor = rnorm(500, 0, 1), trial = trial_id)
})
uncor_df <- do.call(rbind, runs)
uncor_df$index <- seq_len(nrow(uncor_df))

# Uncorrelated-errors plot: scale_color_brewer (not scale_fill_brewer) so the
# Dark2 palette actually applies to the colour aesthetic; trial is a factor
# because Brewer palettes are discrete.
uncor_plot <- ggplot(uncor_df, aes(x = index, y = cor, color = factor(trial))) +
  geom_point() +
  scale_color_brewer(palette = "Dark2")

# Show graphs
cor_plot

uncor_plot

# Correlated noise as a standard Gaussian random walk:
# eps[1] ~ N(0, 1), eps[i] = eps[i-1] + N(0, 1).
#
# Args:
#   n: number of noise terms (n >= 1).
# Returns:
#   Numeric vector of length n.
rnoise <- function(n) {
  stopifnot(n >= 1)
  # cumsum of n iid N(0, 1) draws consumes the RNG stream in the same order as
  # the original element-wise loop, and it handles n == 1 — the original's
  # `for (i in 2:n)` ran backwards as 2:1 and errored when n == 1.
  cumsum(rnorm(n, 0, 1))
}


# Simulate 5 runs of each noise type (500 points per run); one column per run
num_simulation <- 5
correlated_noise <- replicate(num_simulation, rnoise(n = 500))
independent_noise <- replicate(num_simulation, rnorm(n = 500))

# Side-by-side trajectories of the two noise types
par(mfrow = c(1, 2))
matplot(correlated_noise, type = "l", lty = 1, pch = 19,
        xlab = "n", ylab = "noise", main = "Correlated")
matplot(independent_noise, type = "l", lty = 1, pch = 19,
        xlab = "n", ylab = "noise", main = "Independent")

2c

Comment on the difference. According to the plots you’ve created, please answer the following:

Do both types of errors have mean 0 overall (1 sentence)? What could you do to be more certain (1 sentence)?

The uncorrelated errors do have mean 0 overall, while the correlated errors definitely do not. We could calculate the mean of the error column in each data frame to check this quickly.

Uncorrelated:

mean(uncor_df$cor)
## [1] -0.0006140347

Correlated:

mean(final_df$cor)
## [1] -0.4143909

We see that the correlated errors have a mean of about -0.41 (pooled across the five runs) and that the uncorrelated errors have a near-zero mean of about -0.0006.

Which type of errors has a larger variance or are they comparable?

var(final_df$cor)
## [1] 217.82
var(uncor_df$cor)
## [1] 1.009564

The correlated errors have a variance of about 217.8, while the uncorrelated errors have a variance of about 1.01. This makes sense, as the uncorrelated errors are draws from a standard normal distribution and so should have variance close to 1.

The correlated errors definitely have the higher variance.

2d

How correlated errors affect the ordinary least squared regression. Please simulate data from the usual linear model Yi = β0 + β1xi + ϵi, but now ϵi are generated from your correlated function as in part (a). Let x1, x2, . . . , xn be evenly spaced values between 0 and 10 in increments of 0.2 (inclusive of bounds). Set β0 = 1, β1 = 2. At each value of xi, we only observe one yi. For each dataset generated, please fit the OLS and store: • the fitted parameters; • the associated mean and SE for the parameters in the output of summary.lm(). Please create 1000 simulated datasets and estimate the mean and SE of the regression coefficients. Do the mean and SE based on the simulated values “overall agree” with the values from summary.lm()? Please explain your answer. (Hint: Please refer to the R Hints.)

# Simulate one dataset from Y = X beta + eps with correlated errors from
# part (a), then fit OLS of Y on x.
#
# Args:
#   beta: length-2 vector c(intercept, slope).
# Returns:
#   The fitted lm object.
generate_regression <- function(beta) {
  # Design matrix: intercept column plus x = 0, 0.2, ..., 10
  X <- cbind(1, seq(0, 10, by = 0.2))
  # Response with correlated (random-walk) errors
  Y <- X %*% beta + generate_errors(nrow(X))
  lm(Y ~ X[, 2])
}

# Fit 1000 simulated datasets and keep the lm objects
regressions <- replicate(1000, generate_regression(c(1, 2)), simplify = FALSE)

# Coefficient matrix: row 1 = intercept, row 2 = slope; one column per dataset
fitted_parameters <- sapply(regressions, coef)

# Means and standard errors as reported by summary.lm() for each fit
computed_mean <- sapply(regressions, function(fit) summary(fit)$coefficients[, "Estimate"])
computed_se <- sapply(regressions, function(fit) summary(fit)$coefficients[, "Std. Error"])

# Monte-Carlo mean and SE of the fitted coefficients across simulations
estimated_mean <- apply(fitted_parameters, 1, mean)
estimated_se <- apply(fitted_parameters, 1, sd)

2e

Which of our usual regression conditions are satisfied in our simulation above? Please also state why the conditions are violated:

Linearity is not violated, because this is how the data was generated, and the errors have mean 0 marginally. Since we used the rnorm() function to generate the data, the errors are normally distributed. However, crucially, the errors are not independent: each error is generated from the previous one, as shown in the code above. The errors also do not have constant variance, violating the homoscedasticity assumption — the variance of the random-walk errors grows with the index.

Project Question Series 3

Formulate a statistical inference problem that will help your instructor to predict the travel time of future trips.:

Propose a detailed analytical pipeline to solve the statistical inference problem:

Design the experiment to evaluate the effectiveness of your approach:

flights <- read.csv("pnwflights14.csv")

# Remove NAs
flights <- na.omit(flights)

# Remove nonsense entries via logical subsetting.
# BUG FIX: the original used flights[-which(cond), ]; when which() matches
# nothing it returns integer(0), and flights[-integer(0), ] silently drops
# EVERY row. Logical indexing has no such edge case.
flights <- flights[flights$dep_time >= 0, ]
flights <- flights[flights$air_time >= 0, ]
flights <- flights[flights$distance >= 0, ]

# Convert the departure timing into a date-time object
flights$date <- paste0(flights$year, "-", flights$month, "-", flights$day, " ",
                       flights$hour, ":", flights$minute)
flights$dep_date <- strptime(flights$date, format = "%Y-%m-%d %H:%M")

flights2 <- flights

# Arrival time is stored as HHMM (e.g. 1530): zero-pad, parse as a clock time,
# then attach the flight's date and parse the combined string
flights2$arr_time2 <- substr(as.POSIXct(sprintf("%04.0f", flights$arr_time), format = "%H%M"), 12, 16)
flights2$arr_date <- paste0(flights$year, "-", flights$month, "-", flights$day, " ", flights2$arr_time2)
flights2$arr_date <- strptime(flights2$arr_date, format = "%Y-%m-%d %H:%M")

flights$arr_date <- flights2$arr_date

Formulate a statistical inference problem that will help your instructor to predict the travel time of future trips.:

# Mean arrival delay per carrier, sorted worst-first
by_carrier <- group_by(flights, carrier)
carrier_summary <- summarise(by_carrier, mean_delay = mean(arr_delay), n = n())
avg_carrier_delays <- arrange(carrier_summary, desc(mean_delay))

avg_carrier_delays

The goal of this problem is to predict the travel time of a future trip. Above I calculated the mean arrival delay between flight carriers. I want to see if there is a true difference in the mean arrival delay between the carrier with the lowest arrival delay and the carrier with the highest arrival delay. This will be useful in determining which carrier is the best to pick to minimize arrival delays.

Propose a detailed analytical pipeline to solve the statistical inference problem:

To evaluate if one carrier is better than another carrier, I will conduct a difference in means test. I will then also create a simple linear regression model, to see if other variables better explain the difference in arrival time than carrier.

Design the experiment to evaluate the effectiveness of your approach:

Pre processing:

# Pre-processing (same pipeline as the earlier chunk).
# NOTE(review): the original read from an absolute user-specific path; the
# relative path keeps the script portable and matches the first chunk.
flights <- read.csv("pnwflights14.csv", header = TRUE)

# Remove NAs
flights <- na.omit(flights)

# Remove nonsense entries via logical subsetting.
# BUG FIX: flights[-which(cond), ] drops every row when which() matches
# nothing, because -integer(0) is integer(0).
flights <- flights[flights$dep_time >= 0, ]
flights <- flights[flights$air_time >= 0, ]
flights <- flights[flights$distance >= 0, ]

# Convert the departure timing into a date-time object
flights$date <- paste0(flights$year, "-", flights$month, "-", flights$day, " ",
                       flights$hour, ":", flights$minute)
flights$dep_date <- strptime(flights$date, format = "%Y-%m-%d %H:%M")

flights2 <- flights

# Arrival time is HHMM: zero-pad, parse as clock time, then attach the date
flights2$arr_time2 <- substr(as.POSIXct(sprintf("%04.0f", flights$arr_time), format = "%H%M"), 12, 16)
flights2$arr_date <- paste0(flights$year, "-", flights$month, "-", flights$day, " ", flights2$arr_time2)
flights2$arr_date <- strptime(flights2$arr_date, format = "%Y-%m-%d %H:%M")
flights$arr_date <- flights2$arr_date

# Recode categorical variables as factors
flights$month <- as.factor(flights$month)
flights$year <- as.factor(flights$year)
flights$day <- as.factor(flights$day)

# 3007 tail numbers across 145459 rows, so tailnum likely identifies aircraft
flights$tailnum <- as.factor(flights$tailnum)
flights$origin <- as.factor(flights$origin)
flights$dest <- as.factor(flights$dest)
flights$flight <- as.factor(flights$flight)

Regression

Now we will do some exploratory analysis, and make a simple model and try to identify useful features.

# Delay against arrival / departure date
plot(flights$arr_date, flights$arr_delay, main = "Arrival Date vs Arrival Delay")

plot(flights$dep_date, flights$arr_delay, main = "Departure Date vs Arrival Delay")

# dep_delay and arr_delay are highly correlated
plot(flights$dep_delay, flights$arr_delay)

# Check for collinearity among the numeric candidate predictors
flights2 <- flights[, c("dep_delay", "arr_time", "arr_delay", "air_time", "distance")]

cor(flights2)
##              dep_delay    arr_time   arr_delay     air_time     distance
## dep_delay  1.000000000  0.05626588  0.92351496 -0.008110625 -0.003219088
## arr_time   0.056265885  1.00000000  0.06920322 -0.042684748 -0.048964531
## arr_delay  0.923514961  0.06920322  1.00000000 -0.023384820 -0.054510499
## air_time  -0.008110625 -0.04268475 -0.02338482  1.000000000  0.985321253
## distance  -0.003219088 -0.04896453 -0.05451050  0.985321253  1.000000000

From our correlation matrix, we notice a high degree of correlation between arr_delay and dep_delay, as well as between distance and air_time. We will need to keep this in mind when designing our model, as collinearity violates the assumptions of linear regression.

Now we will remove unnecessary/duplicate features make a simple OLS model, and then evaluate it’s predictive accuracy:

# Drop calendar fields that dep_date already encodes, plus the raw date string
flights2 <- select(flights, -c(month, day, year, date))

flights2$dep_date <- as.Date(flights2$dep_date)

# 80/20 train/test split stratified on the response
trainIndex <- createDataPartition(flights2$arr_delay, p = 0.8,
                                  list = FALSE,
                                  times = 1)
train <- flights2[trainIndex, ]
test <- flights2[-trainIndex, ]

# Baseline: single-predictor OLS on air time, evaluated on the hold-out set
simple1 <- lm(arr_delay ~ air_time, data = train)
simple1_predict <- predict(simple1, test)
RMSE(simple1_predict, test$arr_delay)
## [1] 30.21482

RMSE ≈ 30.2

# Richer OLS: route, carrier and date information in addition to air time
simple2 <- lm(arr_delay ~ air_time + carrier + origin + distance + dest + dep_date,
              data = train)
simple2_predict <- predict(simple2, test)
RMSE(simple2_predict, test$arr_delay)
## [1] 28.88639

RMSE ≈ 28.9

# Define training control: 10-fold cross-validation
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)

# Train the linear model on the full data set under cross-validation
model <- train(arr_delay ~ air_time + carrier + origin + distance + dest + dep_date,
               data = flights2, method = "lm", trControl = train.control)

# Cross-validated RMSE = 29.88867
print(model)
## Linear Regression 
## 
## 145459 samples
##      6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 130913, 130914, 130913, 130914, 130912, 130912, ... 
## Resampling results:
## 
##   RMSE      Rsquared    MAE     
##   29.88867  0.08199325  15.34389
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Candidate OLS models refit on the FULL dataset (flights2), ranging from a
# single predictor up to route/carrier/delay/date combinations.
simple1 <- lm(arr_delay ~ air_time, data = flights2)
simple2 <- lm(arr_delay ~ air_time+carrier+origin+distance+dest+dep_date, data = flights2)
simple3 <- lm(arr_delay ~ origin+dest+carrier+distance+dep_delay, data = flights2)
simple4 <- lm(arr_delay ~ origin+dest+carrier+distance+dep_delay+dep_date, data = flights2)
simple5 <- lm(arr_delay ~ origin+carrier+distance+dep_delay+dep_date, data = flights2)


summary(simple1)
summary(simple2)
summary(simple3)
# NOTE(review): simple2 here is fitted on all of flights2 but evaluated on
# `test`, which is a subset of its training data — this RMSE is optimistic due
# to train/test leakage. Refit on `train` (as earlier) for an honest estimate.
simple2_predict <- predict(simple2, test)
RMSE(simple2_predict, test$arr_delay)
## [1] 28.88639
summary(simple2)
## 
## Call:
## lm(formula = arr_delay ~ air_time + carrier + origin + distance + 
##     dest + dep_date, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -57.01  -12.97   -5.96    2.96 1544.36 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -33.579320  14.843899  -2.262 0.023689 *  
## air_time      0.819340   0.009545  85.836  < 2e-16 ***
## carrierAS    -3.498470   0.679959  -5.145 2.68e-07 ***
## carrierB6     8.115957   1.060463   7.653 1.98e-14 ***
## carrierDL     1.057128   0.810955   1.304 0.192386    
## carrierF9     5.728200   1.051792   5.446 5.16e-08 ***
## carrierHA     6.148580   1.440971   4.267 1.98e-05 ***
## carrierOO    -0.589468   0.778483  -0.757 0.448931    
## carrierUA    -3.493424   0.714200  -4.891 1.00e-06 ***
## carrierUS    -4.840288   0.999994  -4.840 1.30e-06 ***
## carrierVX    -5.639227   0.945940  -5.962 2.51e-09 ***
## carrierWN     3.907996   0.758773   5.150 2.60e-07 ***
## originSEA     3.880219   0.295462  13.133  < 2e-16 ***
## distance     -0.111328   0.003416 -32.591  < 2e-16 ***
## destANC      -9.434850   1.580866  -5.968 2.41e-09 ***
## destATL      23.746277   3.541739   6.705 2.03e-11 ***
## destAUS      13.031774   2.827907   4.608 4.06e-06 ***
## destBLI     -13.645069   5.320922  -2.564 0.010336 *  
## destBNA      10.226522   4.016265   2.546 0.010889 *  
## destBOI     -16.435050   4.514015  -3.641 0.000272 ***
## destBOS      28.447960   4.543318   6.261 3.83e-10 ***
## destBUR      -9.623052   1.693521  -5.682 1.33e-08 ***
## destBWI      24.441247   4.327699   5.648 1.63e-08 ***
## destBZN     -21.197221  21.464053  -0.988 0.323366    
## destCLE      41.953788   6.052034   6.932 4.17e-12 ***
## destCLT      29.342556   3.972325   7.387 1.51e-13 ***
## destCOS       0.278533   2.286084   0.122 0.903027    
## destCVG      13.417821   4.057497   3.307 0.000944 ***
## destDCA      25.249663   4.094159   6.167 6.97e-10 ***
## destDEN      -0.120057   1.363113  -0.088 0.929817    
## destDFW      15.002323   2.096219   7.157 8.30e-13 ***
## destDTW      17.832479   2.856795   6.242 4.33e-10 ***
## destEUG     -19.916251   3.757198  -5.301 1.15e-07 ***
## destEWR      30.043377   4.215103   7.128 1.03e-12 ***
## destFAI      -5.977125   1.957491  -3.053 0.002263 ** 
## destFAT      -9.466548   2.293133  -4.128 3.66e-05 ***
## destFLL      30.589430   5.413095   5.651 1.60e-08 ***
## destGEG     -14.235325   3.433494  -4.146 3.39e-05 ***
## destHDN       2.770253   6.235607   0.444 0.656853    
## destHNL       1.524078   4.995720   0.305 0.760308    
## destHOU      15.021542   3.638335   4.129 3.65e-05 ***
## destIAD      30.382142   3.981412   7.631 2.35e-14 ***
## destIAH      17.099249   2.615067   6.539 6.23e-11 ***
## destJAC      -9.059583   9.789101  -0.925 0.354720    
## destJFK      29.837914   4.303215   6.934 4.12e-12 ***
## destJNU     -12.364542   1.820744  -6.791 1.12e-11 ***
## destKOA      -1.939786   5.123799  -0.379 0.704998    
## destKTN     -13.704115   2.281001  -6.008 1.88e-09 ***
## destLAS     -10.647381   1.645430  -6.471 9.78e-11 ***
## destLAX      -7.734320   1.507715  -5.130 2.90e-07 ***
## destLGB     -20.293838   1.695910 -11.966  < 2e-16 ***
## destLIH      -2.327093   5.200129  -0.448 0.654510    
## destLMT     -29.951326   4.227893  -7.084 1.41e-12 ***
## destMCI       7.497107   2.016478   3.718 0.000201 ***
## destMCO      27.138619   4.842572   5.604 2.10e-08 ***
## destMDW       5.275071   2.389440   2.208 0.027270 *  
## destMIA      37.486776   5.474645   6.847 7.56e-12 ***
## destMKE       8.281312   2.782996   2.976 0.002924 ** 
## destMSP       5.214388   1.590726   3.278 0.001046 ** 
## destMSY       4.222769   3.982483   1.060 0.288994    
## destOAK     -12.061792   2.134747  -5.650 1.61e-08 ***
## destOGG      -0.828197   4.895149  -0.169 0.865650    
## destOMA       5.554038   2.309222   2.405 0.016167 *  
## destONT      -8.405240   1.689515  -4.975 6.54e-07 ***
## destORD      20.026661   2.242249   8.932  < 2e-16 ***
## destPDX     -23.056450   3.682913  -6.260 3.85e-10 ***
## destPHL      31.218660   4.206677   7.421 1.17e-13 ***
## destPHX      -2.866756   1.320319  -2.171 0.029914 *  
## destPSP      -9.569943   1.810868  -5.285 1.26e-07 ***
## destRDM     -19.035885   3.694111  -5.153 2.57e-07 ***
## destRNO      -8.310339   3.067706  -2.709 0.006750 ** 
## destSAN      -7.808575   1.400438  -5.576 2.47e-08 ***
## destSAT       6.567756   2.906068   2.260 0.023822 *  
## destSBA      -9.048387   2.084459  -4.341 1.42e-05 ***
## destSEA     -20.217979   3.494754  -5.785 7.26e-09 ***
## destSFO      -8.950743   2.153067  -4.157 3.22e-05 ***
## destSIT     -11.474127   3.819219  -3.004 0.002662 ** 
## destSJC     -14.805224   2.080129  -7.117 1.11e-12 ***
## destSLC     -10.270665   2.071547  -4.958 7.13e-07 ***
## destSMF      -8.079258   2.315200  -3.490 0.000484 ***
## destSNA     -13.101940   1.548602  -8.460  < 2e-16 ***
## destSTL      13.722940   2.606096   5.266 1.40e-07 ***
## destTPA       7.090775   5.134051   1.381 0.167243    
## destTUS      -4.145606   1.807512  -2.294 0.021819 *  
## dep_date      0.002728   0.000881   3.096 0.001961 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.16 on 116284 degrees of freedom
## Multiple R-squared:  0.08139,    Adjusted R-squared:  0.08073 
## F-statistic: 122.7 on 84 and 116284 DF,  p-value: < 2.2e-16
# Likelihood-ratio test: does the richer model (simple2) fit significantly
# better than the air_time-only baseline (simple1)?
lrtest(simple1, simple2)
# Standard lm diagnostics: residuals vs fitted, Q-Q, scale-location, leverage
plot(simple2)

For this problem, I originally thought of using machine learning methods, as I thought that these would be able to make models with very high predictive accuracy. Interestingly, the model that I settled on had a hold-out RMSE of about 28.9, while the 10-fold cross-validated linear model's RMSE was about 29.9.

This led me to think about the problem in a new way, as the goal of this problem is statistical inference, and thus making a model that fits well is more important than making a model that predicts well. I eventually decided to use OLS as it would be easy to interpret, and I used variables that were likely to have an effect on arrival delay. I avoided using both air_time and distance, as these variables are highly correlated, and I did not want to introduce collinearity.

I chose to fit an OLS regression on arr_delay, with the following variables as predictors: origin,dest,carrier,distance,dep_delay,dep_date. This model achieved an R^2 of 0.86, which is much higher than my simplest model’s R^2 of 0.081. Relative to my simplest model, we can also confirm using the likelihood ratio test that my model fits the data better (p < 0.0001).

From the Cook's distance plot, we can see that there are very few influential values, and from the Normal Q-Q plot, we can check the normality assumption. Most of the variables in this model were statistically significant, with the exception of a few carriers and origin/destination airports.

What this model shows us, is that for this dataset, there exists a relationship between the predictors and arrival delay, and approximately 87% of the variability in the data can be explained by the model.

Difference in Means Test

# Keep only the two carriers being compared
df <- subset(flights, carrier %in% c("F9", "US"))

df <- select(df, carrier, arr_delay)

# Arrival-delay distribution for each carrier on a shared x-range
hist(df$arr_delay[df$carrier == "F9"],
     main = "Distribution of F9 Carrier Arrival Delay",
     xlim = c(0, 200), xlab = "Arrival Delay (minutes)")

hist(df$arr_delay[df$carrier == "US"],
     main = "Distribution of US Carrier Arrival Delay",
     xlim = c(0, 200), xlab = "Arrival Delay (minutes)")

The conditions for conducting a difference in means test are that the samples are obtained randomly. I am not completely sure where this data came from but I will assume for now that the samples are obtained randomly.

Since each row represents a different flight, it is fair to assume that samples are independent, and that one flight does not affect another. In any case where it would like a certain accident on the runway causing delays of other flights, this variability will be mitigated by random sampling.

The histograms above do not appear to depict the sampling distribution as normal. However, since the sample size is greater than 40 for both airlines, we can meet the normality assumption.

# Welch two-sample t-test: H0 is equal mean arrival delay for F9 vs US
t.test(arr_delay ~ carrier, data = df)
## 
##  Welch Two Sample t-test
## 
## data:  arr_delay by carrier
## t = 11.405, df = 3405.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F9 and group US is not equal to 0
## 95 percent confidence interval:
##   8.730665 12.355621
## sample estimates:
## mean in group F9 mean in group US 
##         9.201161        -1.341982

Our t test returned a p value of < 2.2e-16 for the null hypothesis that the true difference in means is equal to zero. Sample sizes were significantly large with 2411 F9 flights and 5319 US flights. This means that we can reject the null hypothesis, and confirm that the true difference in means is not zero. A 95% confidence interval for the true difference in means was [8.730665, 12.355621] and zero is notably not in the confidence interval.

Conclusion

Our regression model provided a coefficient of 5.669208 for having F9 as carrier and a coefficient of -4.472456 for having US as a carrier, and both coefficients were statistically significant. Our difference in means test showed that there is a real difference in the mean arrival delay between F9 and US. If trying to make a decision solely based on airline, the professor should pick US, as its flights are statistically likely to arrive with less delay than other airlines' flights.

However the following histogram shows the distribution of all of the regression coefficients:

# Distribution of ALL simple2 coefficients; note most are dummy-variable
# contrasts (carrier/destination levels), so their magnitudes are relative to
# the baseline level rather than directly comparable effect sizes
hist(coef(simple2), xlab = "coefficient value", main = "Distribution of Regression coefficients")

From observing the distribution, we observe that 5.6 and -4.4 appear to be relatively in the center, meaning that the carrier effects are smaller than those of other variables in the model. While picking US is likely to reduce the arrival delay, other variables have larger effects. For example, having Cleveland (CLE) as a destination has a coefficient of about 42 and is much more likely to increase the arrival delay, while having LMT as a destination has a coefficient of about -30 and is much more likely to decrease it.